*-------------------------------------------------------------------------------
/* title : Trim raw firm-level data for estimation
   source: ORBIS-Amadeus (proprietary firm-level data)	 
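   input : $temp/Raw.dta
		   $temp/Selection_criteria.dta (existing table, extended by this file)
   output: $temp/Sample.dta 
		   $temp/Sample15.dta (sample trimmed with the alternative cutoff)
		   $temp/Selection_criteria.dta */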
*-------------------------------------------------------------------------------

*===============================================================================
* Set up
*===============================================================================
*--------------------------SET-UP & READ DIRECTORIES----------------------------
* NOTE(review): cd is called without a path; this looks like a placeholder for
* the folder that holds the code - confirm before running
cd
* Presumably defines the directory globals (e.g. $temp) used below - verify
do "[0.0] directories.do"

*-------------------------------MACROS FOR DATA---------------------------------
* Industry classification: nace2 (nace rev.2 2-digit); other, e.g. A38/CPA intermediate aggregation
global indlev "CPA"
* (1-#%)th percentile cutoff of chi-squared BACON outlier distribution; 15 (default): http://www.stata-journal.com/article.html?article=st0197
global pbacon "30"

*===============================================================================
* Trim outliers using BACON procedure (drop observations)
*===============================================================================
use "$temp/Raw.dta", clear

*---------------------Statistics for Bacon Trimming before----------------------
* Record the size of the raw sample as a one-row dataset (country "EU") that
* starts the temporary selection-criteria file. preserve/restore brings the
* full data back afterwards.
preserve
	local criterion bacon$pbacon				// Suffix identifying this selection step
	xtsum SALES
	* Stored as double: the default float type is exact only up to 2^24 and
	* would round large counts and totals to about 7 significant digits
	gen double N0_`criterion' = r(N) 		// #of observations
	gen double F0_`criterion' = r(n) 		// #of firms
	sum SALES
	gen double Y0_`criterion' = r(sum) 		// sales
	sum L
	gen double L0_`criterion' = r(sum) 		// labour units
	keep country *_`criterion'
	keep if _n==1							// Statistics are constant across rows, one row suffices
	replace country = "EU"
	save "$temp/Selection_criteria_bacon.dta", replace
restore

local variable 	SHARE y k l m 				// Variables considered in estimation process also used to trim data

* Convert the cutoff from percent to a proportion numerically. Pasting "0." in
* front of the global would turn a single-digit cutoff such as 5 into 0.5.
local pctile = $pbacon/100

* Trim per industry
* NOTE(review): the comparison below is unquoted, so it presumes the industry
* variable is numeric - confirm
glevelsof $indlev, local(industries)
foreach i of local industries {
	qui bacon `variable' if $indlev==`i', gen(outbacon) percentile(`pctile')
	drop if outbacon==1
	drop outbacon							// Free the name for the next industry
}

* Trim whole sample
qui bacon `variable', gen(outbacon) percentile(`pctile')
drop if outbacon==1
drop outbacon

*---------------------Statistics for Bacon Trimming after----------------------
* Record the sample size after BACON trimming and add it to the one-row
* selection-criteria file
preserve
	local criterion bacon$pbacon				// Suffix identifying this selection step
	xtsum SALES
	* Stored as double: the default float type is exact only up to 2^24 and
	* would round large counts and totals to about 7 significant digits
	gen double N1_`criterion' = r(N) 		// #of observations
	gen double F1_`criterion' = r(n) 		// #of firms
	sum SALES
	gen double Y1_`criterion' = r(sum) 		// sales
	sum L
	gen double L1_`criterion' = r(sum) 		// labour units
	keep country *_`criterion'
	keep if _n==1							// Statistics are constant across rows, one row suffices
	replace country = "EU" 
	merge 1:1 country using "$temp/Selection_criteria_bacon.dta", nogen
	save "$temp/Selection_criteria_bacon.dta", replace
restore

*===============================================================================
* Keep firms with at least 3 consecutive observations (could still have gaps)
*===============================================================================
*-------------------------------Statistics before-------------------------------
* Record the sample size before the consecutive-observations filter and add it
* to the one-row selection-criteria file
preserve
	local criterion drop3bacon$pbacon		// Suffix identifying this selection step
	xtsum SALES
	* Stored as double: the default float type is exact only up to 2^24 and
	* would round large counts and totals to about 7 significant digits
	gen double N0_`criterion' = r(N) 		// #of observations
	gen double F0_`criterion' = r(n) 		// #of firms
	sum SALES
	gen double Y0_`criterion' = r(sum) 		// sales
	sum L
	gen double L0_`criterion' = r(sum) 		// labour units
	keep country *_`criterion'
	keep if _n==1							// Statistics are constant across rows, one row suffices
	replace country = "EU" 
	merge 1:1 country using "$temp/Selection_criteria_bacon.dta", nogen
	save "$temp/Selection_criteria_bacon.dta", replace
restore

* Remove every firm that never has 3 observations in 3 consecutive years
tsset newid year
* year_count = number of the firm's observations with year in [year-2, year]
rangestat (count) year, interval(year -2 0) by(newid)
* Sorting on year_count within firm puts the firm's largest window count last;
* the firm is flagged when even that largest count is below 3
by newid (year_count), sort: generate to_drop = (year_count[_N] < 3)
drop if to_drop
capture drop year_count to_drop

*--------------------------------Statistics after-------------------------------
* Record the sample size after the consecutive-observations filter and add it
* to the one-row selection-criteria file
preserve
	local criterion drop3bacon$pbacon		// Suffix identifying this selection step
	xtsum SALES
	* Stored as double: the default float type is exact only up to 2^24 and
	* would round large counts and totals to about 7 significant digits
	gen double N1_`criterion' = r(N) 		// #of observations
	gen double F1_`criterion' = r(n) 		// #of firms
	sum SALES
	gen double Y1_`criterion' = r(sum) 		// sales
	sum L
	gen double L1_`criterion' = r(sum) 		// labour units
	keep country *_`criterion'
	keep if _n==1							// Statistics are constant across rows, one row suffices
	replace country = "EU" 
	merge 1:1 country using "$temp/Selection_criteria_bacon.dta", nogen
	save "$temp/Selection_criteria_bacon.dta", replace
restore

*===============================================================================
* Save estimation sample
*===============================================================================
* Declare the firm-year panel, shrink storage types and write the estimation
* sample in Stata 13 file format
xtset newid year
quietly compress
saveold "$temp/Sample.dta", version(13) replace

*===============================================================================
* Other trimming cutoffs
*===============================================================================
* Start again from the raw data with a different cutoff. No "before" statistics
* are stored for this cutoff; the raw-data baseline is the one recorded by the
* first pass (N0_bacon30 etc.).
use "$temp/Raw.dta", clear 

global pbacon 	15 							// (1-#%)th percentile cutoff of chi-squared BACON outlier distribution; 15 (default): http://www.stata-journal.com/article.html?article=st0197

local variable 	SHARE y k l m 				// Variables considered in estimation process also used to trim data

* Convert the cutoff from percent to a proportion numerically. Pasting "0." in
* front of the global would turn a single-digit cutoff such as 5 into 0.5.
local pctile = $pbacon/100

* Trim per industry
* NOTE(review): the comparison below is unquoted, so it presumes the industry
* variable is numeric - confirm
glevelsof $indlev, local(industries)
foreach i of local industries {
	qui bacon `variable' if $indlev==`i', gen(outbacon) percentile(`pctile')
	drop if outbacon==1
	drop outbacon							// Free the name for the next industry
}

* Trim whole sample
qui bacon `variable', gen(outbacon) percentile(`pctile')
drop if outbacon==1
drop outbacon

*---------------------Statistics for Bacon Trimming after----------------------
* Record the sample size after BACON trimming with the alternative cutoff and
* add it to the one-row selection-criteria file
preserve
	local criterion bacon$pbacon				// Suffix identifying this selection step
	xtsum SALES
	* Stored as double: the default float type is exact only up to 2^24 and
	* would round large counts and totals to about 7 significant digits
	gen double N1_`criterion' = r(N) 		// #of observations
	gen double F1_`criterion' = r(n) 		// #of firms
	sum SALES
	gen double Y1_`criterion' = r(sum) 		// sales
	sum L
	gen double L1_`criterion' = r(sum) 		// labour units
	keep country *_`criterion'
	keep if _n==1							// Statistics are constant across rows, one row suffices
	replace country = "EU" 
	merge 1:1 country using "$temp/Selection_criteria_bacon.dta", nogen
	save "$temp/Selection_criteria_bacon.dta", replace
restore

*===============================================================================
* Keep firms with at least 3 consecutive observations (could still have gaps)
*===============================================================================
*-----------------------------Statistics before-----------------------------
* Record the sample size before the consecutive-observations filter
* (alternative cutoff) and add it to the one-row selection-criteria file
preserve
	local criterion drop3bacon$pbacon		// Suffix identifying this selection step
	xtsum SALES
	* Stored as double: the default float type is exact only up to 2^24 and
	* would round large counts and totals to about 7 significant digits
	gen double N0_`criterion' = r(N) 		// #of observations
	gen double F0_`criterion' = r(n) 		// #of firms
	sum SALES
	gen double Y0_`criterion' = r(sum) 		// sales
	sum L
	gen double L0_`criterion' = r(sum) 		// labour units
	keep country *_`criterion'
	keep if _n==1							// Statistics are constant across rows, one row suffices
	replace country = "EU"
	merge 1:1 country using "$temp/Selection_criteria_bacon.dta", nogen
	save "$temp/Selection_criteria_bacon.dta", replace
restore

* Remove every firm that never has 3 observations in 3 consecutive years
tsset newid year
* year_count = number of the firm's observations with year in [year-2, year]
rangestat (count) year, interval(year -2 0) by(newid)
* Sorting on year_count within firm puts the firm's largest window count last;
* the firm is flagged when even that largest count is below 3
by newid (year_count), sort: generate to_drop = (year_count[_N] < 3)
drop if to_drop
capture drop year_count to_drop

*------------------------------Statistics after-----------------------------
* Record the sample size after the consecutive-observations filter
* (alternative cutoff) and add it to the one-row selection-criteria file
preserve
	local criterion drop3bacon$pbacon		// Suffix identifying this selection step
	xtsum SALES
	* Stored as double: the default float type is exact only up to 2^24 and
	* would round large counts and totals to about 7 significant digits
	gen double N1_`criterion' = r(N) 		// #of observations
	gen double F1_`criterion' = r(n) 		// #of firms
	sum SALES
	gen double Y1_`criterion' = r(sum) 		// sales
	sum L
	gen double L1_`criterion' = r(sum) 		// labour units
	keep country *_`criterion'
	keep if _n==1							// Statistics are constant across rows, one row suffices
	replace country = "EU" 
	merge 1:1 country using "$temp/Selection_criteria_bacon.dta", nogen
	save "$temp/Selection_criteria_bacon.dta", replace
restore

*===============================================================================
* Save cleaned dataset
*===============================================================================
* Declare the firm-year panel, shrink storage types and write the sample for
* this cutoff (file name carries the cutoff) in Stata 13 file format
xtset newid year
quietly compress
saveold "$temp/Sample${pbacon}.dta", version(13) replace

* Merge files
* Add the trimming statistics collected above to the selection-criteria table,
* which must already exist in $temp when this point is reached
use "$temp/Selection_criteria.dta", clear
merge 1:1 country using "$temp/Selection_criteria_bacon.dta", nogen
order country
save "$temp/Selection_criteria.dta", replace

* Erase files
* Stata's built-in erase with a forward-slash path runs on every operating
* system; shelling out to "erase" with a backslash path works on Windows only
erase "$temp/Selection_criteria_bacon.dta"
